Pygraphistry Viz



In [1]:

    
# Imports
import graphistry
import numpy as np
import pandas as pd
from py2neo import Graph, Path

graphistry.register(key='48a82a78fdd442482cec24fe06051c905e2a382d581852a4ba645927c736acbcfe7256e22873a5c97cff6b8bd37c836b')

============================



In [41]:

    
# Static - Connect to the database
# graph = Graph('http://neo4j:nakama@ec2-34-212-133-23.us-west-2.compute.amazonaws.com:7474')



In [42]:

    
# tx = graph.cypher.begin()
# for name in ["Alice", "Bob", "Carol"]:
#     tx.append("CREATE (person:Person {name:{name}}) RETURN person", name=name)
# alice, bob, carol = [result.one for result in tx.commit()]

# friends = Path(alice, "KNOWS", bob, "KNOWS", carol)
# graph.create(friends)



In [43]:

    
# graph.data("MATCH (a:address) --> (b:incoming_payment) --> (c:transaction) RETURN  LIMIT 25")



In [44]:

    
# rows = pandas.read_csv('transactions.csv')[:1000]
# graphistry.hypergraph(rows)['graph'].plot()



In [45]:

    
# Retrieve all the paper metadata
# btc_metadata = pd.read_sql_query('SELECT * FROM Papers', conn)

# df = pd.DataFrame(graph.data("MATCH (n:transaction) Return n LIMIT 25"))

df.head()

Retrieve citations data

citations = pd.read_csv('citations.txt', names = ['source', 'target', 'label'])

Dedupe Citations

citations = citations.drop_duplicates(subset=['source', 'target'])

Clean Citations IDs

citations['target'] = citations['target'].str.strip('.') citations['source'] = citations['source'].astype(str).str.strip('.')

Unique subjects

subjects = arxiv_metadata.primary_subject.unique() subject_colors = dict(zip(subjects, range(0, len(subjects)))) arxiv_metadata['color'] = arxiv_metadata.primary_subject.map(lambda x: subject_colors[x])

citations.info()

metadata_merge = citations.merge(arxiv_metadata, left_on='source', right_on='id').merge(arxiv_metadata, left_on='target', right_on='id', suffixes=('_from', '_to'))

metadata_merge.info()

citations = pd.read_csv('Projects/ArXiv/data/citations/citations.txt', names = ['source', 'target', 'label'])

# links = pd.read_csv('./lesmiserables.csv')

citations.head()

Set up the plotter

plotter = graphistry.bind(source="source", destination="target")

plotter.plot(citations)

citations["label"] = citations.value.map(lambda v: "#Meetings: %d" % v)

plotter = plotter.bind(edge_weight="label")

plotter.plot(citations)

Set up igraph for easy metadata etc

ig = plotter.pandas2igraph(citations)

ig = plotter.pandas2igraph(metadata_merge)

Add the Arxiv Metadata

vertex_metadata = pd.DataFrame(ig.vs['nodeid'], columns=['id']).merge(arxiv_metadata, how='left', on='id') ig.vs['primary_subject'] = vertex_metadata['primary_subject'] ig.vs['color'] = vertex_metadata['color'] ig.vs['title'] = vertex_metadata['title'] ig.vs['year'] = vertex_metadata['year'] ig.vs['month'] = vertex_metadata['month'] ig.vs['category'] = vertex_metadata['category']

ig.vs['pagerank'] = ig.pagerank()

ig.vs['community'] = ig.community_infomap().membership

ig.vs['in_degree'] = ig.indegree() plotter.bind(point_size='in_degree', point_color='color').plot(ig)

plotter.bind(point_color='community', point_size='pagerank').plot(ig)



In [ ]:

Silk Road Bitcoin Embezzling Visualization



In [2]:

    
transactions = pd.read_csv('transactions.csv')
transactions['Date'] = pd.to_datetime(transactions['Date'],unit='ms') #coerce date format
transactions[:3]









    Out[2]:







  
    
      
      Amount $
      Date
      Destination
      Source
      Transaction ID
      isTainted
    
  
  
    
      0
      3223.9752
      2013-11-23 20:53:20
      84a0b53e1ac008b8dd0fd6212d4b7fa2...
      2dd13954e18508bb8b3a41d96a022be9...
      b6eb8ba20df31fa74fbe7755f58c18f82a599d6bb5fa79...
      0
    
    
      1
      3708.0216
      2014-05-31 01:33:20
      3b62a891b99969042d4e6ac8158d0a18...
      7c74d3afb41e536e26948a1d2455a7c7...
      60df3c67063e136a0c9715edcd12ae717e6f9ed492afe2...
      0
    
    
      2
      2.4800
      2014-04-27 00:53:20
      3b62a891b99969042d4e6ac8158d0a18...
      50dced19b8ee41114916bf3ca894f455...
      a6aafd3d85600844536b8a5f2c255686c33dc4969e68a4...
      0



In [3]:

    
print('DataFrame headers: {}' .format(list(transactions.columns)))









    



DataFrame headers: ['Amount $', 'Date', 'Destination', 'Source', 'Transaction ID', 'isTainted']



In [4]:

    
transactions.columns[-1]









    Out[4]:





'isTainted'



In [5]:

    
# 'taint' is weighted as 5
transactions['isTainted'].unique()









    Out[5]:





array([0, 5])



In [6]:

    
# for item in transactions[transactions['isTainted'] == 5].isTainted:
#     item = 10



In [7]:

    
# for column in transactions.columns[-1]:
#     transactions[transactions == 5] = 10



In [8]:

    
transactions.shape









    Out[8]:





(45117, 6)



In [9]:

    
transactions.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45117 entries, 0 to 45116
Data columns (total 6 columns):
Amount $          45117 non-null float64
Date              45117 non-null datetime64[ns]
Destination       45117 non-null object
Source            45117 non-null object
Transaction ID    45117 non-null object
isTainted         45117 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 2.1+ MB



In [10]:

    
# transaction window
print(transactions['Date'].sort_values().head(1), '\n')
print(transactions['Date'].sort_values().tail(1))









    



23469   2013-09-01 01:46:40
Name: Date, dtype: datetime64[ns] 

2403   2014-08-22 15:06:40
Name: Date, dtype: datetime64[ns]

Visualization 1: Quick Visualization & Analysis

Task: Spot the embezzling

Use the histogram tool to filter for only tainted transactions
Turn on the Setting "Prune Isolated Nodes" to hide wallets with no remaining transactions
Use the filters or excludes tool to only show transactions over 1000 or 1000.
Verify that money flowed from Ross Ulbricht to Carl Force, and explore where else it flowed.



In [11]:

    
g = graphistry.edges(transactions).bind(source='Source', destination='Destination')



In [12]:

    
g.plot()









    



/Users/eastblue/anaconda/lib/python3.6/site-packages/graphistry/pygraphistry.py:466: FutureWarning: pandas.tslib is deprecated and will be removed in a future version.
You can access NaTType as type(pandas.NaT)
  elif isinstance(obj, pandas.tslib.NaTType):






    Out[12]:

Visualization 2: Summarizing Wallets



In [13]:

    
# Compute how much wallets received in new df 'wallet_in'
wallet_in = transactions\
.groupby('Destination')\
.agg({'isTainted': lambda x: 1 if x.sum() > 0 else 0, 'Amount $': np.sum})\
.reset_index().rename(columns={'Destination': 'wallet', 'isTainted': 'isTaintedWallet'})
# rename destination to wallet
# rename isTainted to isTaintedWallet

#not all wallets received money, tag these
wallet_in['Receivables'] = True

wallet_in[:3]









    Out[13]:







  
    
      
      wallet
      isTaintedWallet
      Amount $
      Receivables
    
  
  
    
      0
      0002b3efbc3e742ee4cfaad18d8cf221...
      0
      41118.416840
      True
    
    
      1
      0005e0fbac078e609bbc3239d3302ff7...
      1
      5577.768000
      True
    
    
      2
      000b3df00e3ff9b7705452071c9e4e87...
      0
      11161.133824
      True



In [14]:

    
wallet_in['isTaintedWallet'].unique()









    Out[14]:





array([0, 1])



In [15]:

    
# Compute how much wallets sent in new df 'wallet_out'
wallet_out = transactions\
  .groupby('Source')\
  .agg({'isTainted': np.sum, 'Amount $': np.max})\
  .reset_index().rename(columns={'Source': 'wallet', 'isTainted': 'isTaintedWallet'})
# rename source to wallet
# rename isTainted to isTaintedWallet

#not all wallets received money, tag these
wallet_out['Payables'] = True

wallet_out[:3]









    Out[15]:







  
    
      
      wallet
      isTaintedWallet
      Amount $
      Payables
    
  
  
    
      0
      0005e0fbac078e609bbc3239d3302ff7...
      0
      6197.520000
      True
    
    
      1
      000b3df00e3ff9b7705452071c9e4e87...
      0
      857.923098
      True
    
    
      2
      0012742095ed1c2ceb334b2a5403da7d...
      0
      3472.000000
      True



In [16]:

    
wallet_out['isTaintedWallet'].unique()









    Out[16]:





array([ 0,  5, 10, 20, 15, 25, 35, 30])



In [17]:

    
# Join Data
wallets = pd.merge(wallet_in, wallet_out, how='outer')
wallets['Receivables'] = wallets['Receivables'].fillna(False)
wallets['Payables'] = wallets['Payables'].fillna(False)
print('# Wallets only sent or only received', len(wallet_in) + len(wallet_out) - len(wallets))
wallets[:3]









    



# Wallets only sent or only received 875






    Out[17]:







  
    
      
      wallet
      isTaintedWallet
      Amount $
      Receivables
      Payables
    
  
  
    
      0
      0002b3efbc3e742ee4cfaad18d8cf221...
      0
      41118.416840
      True
      False
    
    
      1
      0005e0fbac078e609bbc3239d3302ff7...
      1
      5577.768000
      True
      False
    
    
      2
      000b3df00e3ff9b7705452071c9e4e87...
      0
      11161.133824
      True
      False



In [18]:

    
tmp = wallets



In [19]:

    
# colors at: http://staging.graphistry.com/docs/legacy/api/0.9.2/palette.html#Paired
def convert_to_colors(value):
    if value == 0:
        return 36005 # magenta
    else:
        return 42005 # orange

tmp['isTaintedWallet'] = tmp['isTaintedWallet'].apply(convert_to_colors)



In [20]:

    
tmp['isTaintedWallet'].unique()









    Out[20]:





array([36005, 42005])

Plot

Bind color to whether tainted



In [24]:

    
g.nodes(tmp).bind(node='wallet', point_color='isTaintedWallet').plot()









    Out[24]:

Plain-no-audio.mov



In [ ]:

	Amount $	Date	Destination	Source	Transaction ID
0	3223.9752	2013-11-23 20:53:20	84a0b53e1ac008b8dd0fd6212d4b7fa2...	2dd13954e18508bb8b3a41d96a022be9...	b6eb8ba20df31fa74fbe7755f58c18f82a599d6bb5fa79...
1	3708.0216	2014-05-31 01:33:20	3b62a891b99969042d4e6ac8158d0a18...	7c74d3afb41e536e26948a1d2455a7c7...	60df3c67063e136a0c9715edcd12ae717e6f9ed492afe2...
2	2.4800	2014-04-27 00:53:20	3b62a891b99969042d4e6ac8158d0a18...	50dced19b8ee41114916bf3ca894f455...	a6aafd3d85600844536b8a5f2c255686c33dc4969e68a4...

	wallet	isTaintedWallet	Amount $	Receivables
0	0002b3efbc3e742ee4cfaad18d8cf221...	0	41118.416840	True
1	0005e0fbac078e609bbc3239d3302ff7...	1	5577.768000	True
2	000b3df00e3ff9b7705452071c9e4e87...	0	11161.133824	True

	wallet	Amount $	Payables
0	0005e0fbac078e609bbc3239d3302ff7...	6197.520000	True
1	000b3df00e3ff9b7705452071c9e4e87...	857.923098	True
2	0012742095ed1c2ceb334b2a5403da7d...	3472.000000	True